Question: how many connected components does the k-nearest neighbor graph have for a collection of handwritten digit images?
In [1]:
from sklearn import neighbors
In [2]:
from sklearn import datasets
In [3]:
import pylab as pl
%matplotlib inline
pl.rcParams['font.family'] = 'serif'
In [4]:
data = datasets.load_digits()
In [5]:
X = data.data
Y = data.target
In [6]:
from sklearn.decomposition import PCA
pca = PCA(n_components=64,whiten=True)
In [7]:
import networkx as nx
print(nx.__version__)
In [8]:
X_ = pca.fit_transform(X)
In [9]:
epsneighborhood = neighbors.radius_neighbors_graph(X_, 9.0)
G = nx.Graph(epsneighborhood)
nx.number_connected_components(G)
Out[9]:
In [10]:
kneighbors = neighbors.kneighbors_graph(X_, 3)  # mode='distance' is another option
G = nx.Graph(kneighbors)
nx.number_connected_components(G)
Out[10]:
Okay, interesting
In [11]:
cc = [list(c) for c in nx.connected_components(G)]  # lists, so we can index into Y and data.images below
In [12]:
for c in cc:
print(Y[c])
In [13]:
pl.imshow(data.images[cc[0][0]],cmap='gray')
Out[13]:
In [14]:
sum(kneighbors.toarray()[1])
Out[14]:
What do the locally linear maps between tangent spaces look like when learned by Isomap or LLE (or LDA), for example?
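One way to make this concrete (my framing of the question, not a standard definition): if the embedding is a map $f:\mathbb{R}^{d_x}\rightarrow\mathbb{R}^{d_y}$, then near a point $x_i$ it should be well approximated by a linear map,
$$ f(x_j) - f(x_i) \approx C_i\,(x_j - x_i), \qquad C_i \in \mathbb{R}^{d_y \times d_x}, $$
so the per-point matrices $C_i$ play the role of Jacobians of the embedding, and we can ask how they vary across the dataset.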
In [15]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
In [16]:
lda = LDA(n_components=2)
In [17]:
pca = PCA(n_components=8)
X_ = pca.fit_transform(X)
X_.shape
Out[17]:
How well can we do with a linear embedding in the best case, when we know the class memberships ahead of time?
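One way to put a number on "how well" (a sketch, not part of the original analysis; assumes a reasonably recent scikit-learn with `sklearn.model_selection`): cross-validated 1-nearest-neighbor accuracy in the 2-D LDA projection.
In [ ]:
# hypothetical baseline: 1-NN accuracy after projecting to 2-D with LDA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

clf = make_pipeline(LinearDiscriminantAnalysis(n_components=2),
                    KNeighborsClassifier(n_neighbors=1))
print(cross_val_score(clf, X_, Y, cv=5).mean())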
In [18]:
lda.fit(X_,Y)
Out[18]:
In [19]:
lda_map = lda.transform(X_)
In [20]:
pl.scatter(lda_map[:,0],lda_map[:,1],c=Y,linewidth=0)
pl.axis('off')
pl.colorbar();
In [21]:
lo_d = lda_map[1] - lda_map[0]
lo_d
Out[21]:
In [22]:
hi_d = X_[1] - X_[0]
hi_d
Out[22]:
In [23]:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=5)
iso.fit(X_)
iso_map = iso.transform(X_)
In [24]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=5)
lle.fit(X_)
lle_map = lle.transform(X_)
lle_map.shape
Out[24]:
In [26]:
import numpy as np
import numpy.random as npr
y_dif_iso = np.zeros((len(X_),len(lo_d)))
y_dif_lda = np.zeros((len(X_),len(lo_d)))
x_dif = np.zeros((len(X_),len(hi_d)))
i = 0
for j in range(len(X_)):
if j != i:
y_dif_lda[j] = lda_map[j] - lda_map[i]
y_dif_iso[j] = iso_map[j] - iso_map[i]
x_dif[j] = X_[j] - X_[i]
In [27]:
x_dif_pinv = np.linalg.pinv(x_dif)
x_dif_pinv.shape
Out[27]:
In [28]:
C_lda = y_dif_lda.T.dot(x_dif_pinv.T)
C_lda.shape
Out[28]:
In [29]:
C_iso = y_dif_iso.T.dot(x_dif_pinv.T)
C_iso.shape
Out[29]:
In [30]:
C_lda.dot(x_dif.T).shape
Out[30]:
In [31]:
# works well for a globally linear mapping
np.mean(abs(y_dif_lda - C_lda.dot(x_dif.T).T),0)
Out[31]:
In [32]:
# doesn't work so well for a highly nonlinear mapping
np.mean(abs(y_dif_iso - C_iso.dot(x_dif.T).T),0)
Out[32]:
In [33]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
pl.title("Isomap")
Out[33]:
In [34]:
pl.scatter(lle_map[:,0],lle_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
Hmm that doesn't look useful at all
In [35]:
%timeit x_dif_pinv = np.linalg.pinv(x_dif)
In [36]:
x_dif.shape
Out[36]:
In [38]:
import numpy as np
import numpy.random as npr
#C = np.ones((len(lo_d),len(hi_d)))
C = npr.randn(len(lo_d),len(hi_d))
C.shape
Out[38]:
In [39]:
C.dot(x_dif.T)
Out[39]:
In [40]:
hi_d.shape
Out[40]:
So that's cool: that's for one point, evaluated w.r.t. all other points in the dataset. Next up: compute the map w.r.t. only the $k$-nearest neighbors.
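Concretely, for each point $i$ the local map is fit by least squares over its neighborhood (this is what `construct_C` below implements, via the pseudo-inverse):
$$ C_i = \arg\min_{C} \sum_{j \in \mathcal{N}(i)} \big\lVert (y_j - y_i) - C\,(x_j - x_i) \big\rVert^2, $$
i.e. stacking the differences as rows of $\Delta X_i \in \mathbb{R}^{k \times d_x}$ and $\Delta Y_i \in \mathbb{R}^{k \times d_y}$, the solution is $C_i = \Delta Y_i^{\top} (\Delta X_i^{+})^{\top}$.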
In [41]:
k=10
knearest = neighbors.kneighbors_graph(X_,k).toarray()
knearest = knearest - np.diag(np.ones(len(X_))) # knearest[i,i] = 0
In [42]:
knearest_sparse = neighbors.kneighbors_graph(X_,k)
In [43]:
row = knearest_sparse.getrow(0)
In [44]:
row.indices
Out[44]:
In [45]:
knearest_sparse[0].indices
Out[45]:
In [46]:
knearest[0].sum()
Out[46]:
In [47]:
i = 0
for ind,j in enumerate(knearest_sparse[i].indices):
if j != i:
print(ind,j)
In [48]:
def construct_C(X,Y,k=10):
''' X = high-dimensional feature-vectors,
Y = low-dimensional feature-vectors,
k = number of neighbors to consider'''
d_x = X.shape[1]
d_y = Y.shape[1]
knearest_sparse = neighbors.kneighbors_graph(X,k)
#Y_reconstructed = np.zeros(Y.shape)
error = np.empty(len(X),dtype=object)
C = np.empty(len(X),dtype=object)
for i in range(len(X)):
y_dif = np.zeros((k,d_y))
x_dif = np.zeros((k,d_x))
for ind,j in enumerate(knearest_sparse[i].indices):
if j != i:
#y_dif_lda[j] = lda_map[j] - lda_map[i]
#y_dif_iso[ind] = iso_map[j] - iso_map[i]
y_dif[ind] = Y[j] - Y[i]
x_dif[ind] = X[j] - X[i]
x_dif_pinv = np.linalg.pinv(x_dif)
C_ = y_dif.T.dot(x_dif_pinv.T)
error[i] = y_dif - C_.dot(x_dif.T).T
C[i] = C_
#Y_reconstructed[i] = C_.dot(x_dif)
return C,error
In [49]:
%%timeit
knearest_sparse = neighbors.kneighbors_graph(X,15)
In [50]:
C,error = construct_C(X_,iso_map,k=25)
In [51]:
%%timeit
C,error = construct_C(X_,iso_map,k=25)
About half of the computational cost here is in computing the neighborhood graph in the input space; that only needs to be done once, though, so it could be precomputed and reused.
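For instance, a sketch of that refactor (hypothetical `construct_C_pre`, same least-squares fit as `construct_C` but taking a precomputed neighbor graph):
In [ ]:
def construct_C_pre(X, Y, knearest_sparse):
    '''Same local fit as construct_C, but reusing a precomputed k-NN graph.'''
    C = np.empty(len(X), dtype=object)
    error = np.empty(len(X), dtype=object)
    for i in range(len(X)):
        idx = knearest_sparse[i].indices
        x_dif = X[idx] - X[i]
        y_dif = Y[idx] - Y[i]
        C_ = y_dif.T.dot(np.linalg.pinv(x_dif).T)
        error[i] = y_dif - C_.dot(x_dif.T).T
        C[i] = C_
    return C, error

precomputed = neighbors.kneighbors_graph(X_, 25)
C_pre, error_pre = construct_C_pre(X_, iso_map, precomputed)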
In [52]:
C[0].shape
Out[52]:
In [53]:
average_error = np.array([np.mean(np.sqrt(np.sum(np.abs(e)**2, axis=0))) for e in error])
In [54]:
error_stdev = np.array([np.std(np.sqrt(np.sum(np.abs(e)**2, axis=0))) for e in error])
Hmm, so in an Isomap embedding the largest deviations from the locally linear approximation occur for points far from their cluster cores. Interesting.
In [55]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=np.log(average_error),
alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
In [56]:
pl.hist(average_error,bins=50);
pl.xlabel('Local reconstruction error')
pl.ylabel('Count')
Out[56]:
In [57]:
pl.hist(np.log(average_error),bins=50);
pl.xlabel('log(local reconstruction error)')
pl.ylabel('Count')
Out[57]:
In [58]:
pl.scatter(iso_map[:,0],iso_map[:,1],c=error_stdev,
alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
In [59]:
pl.hist(error_stdev,bins=50);
Here I've plotted the points at the locations provided by LDA, colored by the "strain" they felt in the Isomap embedding.
In [60]:
pl.scatter(lda_map[:,0],lda_map[:,1],c=np.log(average_error),alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
Hmm how similar are all the maps in C?
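One quick way to check (a sketch, separate from the PCA comparison below): look at the cosine similarity between each flattened map and the mean map.
In [ ]:
# hypothetical check: cosine similarity of each local map to the mean map
flat = np.vstack([c_i.ravel() for c_i in C])
mean_map = flat.mean(0)
cos_sim = flat.dot(mean_map) / (np.linalg.norm(flat, axis=1) * np.linalg.norm(mean_map))
pl.hist(cos_sim, bins=50);
pl.xlabel('Cosine similarity to mean map');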
In [61]:
map_dim = C[0].shape[0]*C[0].shape[1]
c = C[0].reshape((map_dim,1))
In [62]:
maps = np.hstack([c.reshape((map_dim,1)) for c in C]).T
maps.shape
Out[62]:
In [63]:
maps_rand = npr.randn(*maps.shape)
In [64]:
pca = PCA()
pca.fit(maps)
pca_r = PCA()
pca_r.fit(maps_rand)
Out[64]:
In [65]:
# comparison to random expectation
pl.plot(range(1,maps.shape[1]+1),pca.explained_variance_ratio_,label='observed')
pl.plot(range(1,maps.shape[1]+1),pca_r.explained_variance_ratio_,label='random')
pl.ylim((0,max(pca.explained_variance_ratio_)))
pl.legend(loc='best')
pl.xlim((1,maps.shape[1]+1))
pl.xlabel('Component')
pl.ylabel('Explained variance')
Out[65]:
Now this is getting meta: embedding the map vectors themselves, to visualize them...
In [66]:
iso = Isomap(n_neighbors=5)
iso.fit(maps)
maps_map = iso.transform(maps)
In [67]:
maps_pca = PCA(2).fit_transform(maps)
pl.scatter(maps_pca[:,0], maps_pca[:,1], c=Y, alpha=0.5, linewidth=0)
pl.axis('off')
pl.colorbar();
In [68]:
pl.scatter(maps_map[:,0],maps_map[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
Hmm, instead of examining these for maps generated by existing algorithms, what if we try a case where we have a "ground-truth" low-dimensional map? (Something like the sketch below.)
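A sketch of how that might look (hypothetical; uses a synthetic swiss roll, whose 2-D parameterization is known by construction, and reuses `construct_C` from above):
In [ ]:
from sklearn.datasets import make_swiss_roll

X_sr, t = make_swiss_roll(n_samples=1000, noise=0.05)
Y_true = np.column_stack([t, X_sr[:, 1]])  # known 2-D coordinates of the roll
C_sr, error_sr = construct_C(X_sr, Y_true, k=10)
avg_err_sr = np.array([np.mean(np.sqrt(np.sum(np.abs(e)**2, axis=0))) for e in error_sr])
pl.hist(np.log(avg_err_sr), bins=50);
pl.xlabel('log(local reconstruction error), ground-truth map');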
Hmm, it would also be neat to examine how "dependent" the locally linear approximations are for representations learned by autoencoders, and how that changes over the course of training.
Hmm, I wonder what would happen if you used a stack of nonlinear projections to gradually reduce dimensionality. Let's try it with spectral embedding!
In [69]:
X_ = PCA().fit_transform(X)
X_.shape
Out[69]:
In [70]:
from sklearn.manifold import SpectralEmbedding
In [71]:
se = SpectralEmbedding(n_neighbors=48)
In [72]:
proj = se.fit_transform(X_)
se.n_neighbors_
Out[72]:
In [73]:
se.n_neighbors_
Out[73]:
In [74]:
pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
In [76]:
def one_nn_baseline(X,Y):
one_nn = neighbors.kneighbors_graph(X,2)
inds = np.zeros(len(X),dtype=int)
for i in range(len(X)):
inds[i] = [ind for ind in one_nn[i].indices if ind != i][0]
preds = Y[inds]
return 1.0*sum(preds==Y) / len(Y)
In [77]:
one_nn_baseline(proj,Y)
Out[77]:
In [78]:
dim = X_.shape[1]
dim
Out[78]:
In [79]:
proj = X_.copy()
In [80]:
projs = []
projs.append((dim,proj))
In [81]:
# alternate scale?
scale_neighbors = lambda x : int(6**(1+(x / 31.0)))
In [82]:
pl.plot(range(2,dim),[scale_neighbors(x) for x in range(2,dim)])
Out[82]:
In [83]:
step = 10
target = 2
while dim > target:
comp = max(dim - step,target)
#se = SpectralEmbedding(comp,n_neighbors=3*dim)
se = SpectralEmbedding(comp,n_neighbors=scale_neighbors(dim))
projs.append((dim,se.fit_transform(projs[-1][1])))
dim -= step
print(dim)
In [84]:
for dim,proj in projs:
pl.figure()
pl.title("Input dimensionality: {0}".format(dim))
pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
In [85]:
def iterative_spectral_embedding(X,step=10,target=2,dim_mult=3):
dim = X.shape[1]
projs = []
projs.append(X.copy())
while dim > target:
comp = max(dim - step,target)
# not sure how to select the number of neighbors here
# large k work well when the input dimensionality is high
# but fail for low input dimensionality, and vice-versa
# for low k
se = SpectralEmbedding(comp,n_neighbors=dim_mult*dim)
projs.append(se.fit_transform(projs[-1]))
dim -= step
print(dim)
return projs
In [86]:
def plot_projs(projs):
for proj in projs:
pl.figure()
pl.scatter(proj[:,0],proj[:,1],c=Y,alpha=0.5,linewidth=0)
pl.axis('off')
pl.colorbar();
In [87]:
plot_projs(iterative_spectral_embedding(X_))
In [98]:
plot_projs(iterative_spectral_embedding(X_,dim_mult=2))
In [89]:
plot_projs(iterative_spectral_embedding(X_,dim_mult=5))
In [90]:
cray_projs = iterative_spectral_embedding(X_,step=24,dim_mult=2)
plot_projs(cray_projs)
In [91]:
projs_24_3 = iterative_spectral_embedding(X_,step=24,dim_mult=3)
plot_projs(projs_24_3)
In [92]:
one_nn_baseline(projs_24_3[-1],Y)
Out[92]:
In [93]:
projs_24_4 = iterative_spectral_embedding(X_,step=24,dim_mult=4)
plot_projs(projs_24_4)
In [94]:
one_nn_baseline(projs_24_4[-1],Y)
Out[94]:
In [95]:
projs = iterative_spectral_embedding(X_,step=24,dim_mult=5)
plot_projs(projs)
In [96]:
one_nn_baseline(lda_map,Y)
Out[96]:
In [97]:
one_nn_baseline(PCA(2).fit_transform(X),Y)
Out[97]:
In [99]:
from sklearn.decomposition import KernelPCA
kpca = KernelPCA(2, kernel='sigmoid')
one_nn_baseline(kpca.fit_transform(X), Y)
Out[99]:
In [100]:
one_nn_baseline(iso_map,Y)
Out[100]:
In [101]:
se_map = SpectralEmbedding().fit_transform(X_)
In [102]:
one_nn_baseline(se_map,Y)
Out[102]:
In [103]:
one_nn_baseline(cray_projs[-1],Y)
Out[103]:
In [104]:
one_nn_baseline(projs[-1],Y)
Out[104]:
In [105]:
one_nn_baseline(projs[0][:,:2],Y)
Out[105]:
In [106]:
kpca = KernelPCA()
In [107]:
kpca.fit(X[:,:2])
Out[107]:
In [108]:
kpca.fit(X[:,:20])
Out[108]:
In [109]:
def iterative_kpca(X,step=10,target=2,dim_mult=3):
dim = X.shape[1]
projs = []
projs.append(X.copy())
while dim > target:
comp = max(dim - step,target)
        # dim_mult is unused here; the kernel (and its parameters) is the main knob
        kpca = KernelPCA(comp, kernel='poly')  # alternative: kernel='rbf', gamma=1.0/dim
projs.append(kpca.fit_transform(projs[-1]))
dim -= step
print(dim)
return projs
In [110]:
projs_k = iterative_kpca(X,step=16)
In [111]:
[one_nn_baseline(proj[:,:2],Y) for proj in projs_k]
Out[111]:
In [112]:
plot_projs(iterative_spectral_embedding(X_,step=24,dim_mult=6))
In [113]:
plot_projs(iterative_spectral_embedding(X_,step=16,dim_mult=5))
And now some Bayesian manifold learning tinkering
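Roughly, the model I have in mind (my reading of the quantities constructed below, stated as an assumption rather than a precise reference): each low-dimensional difference is a noisy image of the corresponding high-dimensional difference under the local map,
$$ y_j - y_i \mid C_i \;\sim\; \mathcal{N}\!\big(C_i\,(x_j - x_i),\, V\big), \qquad V^{-1} = \gamma I_{d_y}, $$
for neighboring pairs $(i,j)$, together with a Gaussian prior over the stacked maps whose precision is built from the neighborhood-graph Laplacian (the $\Omega^{-1} = L \otimes I$ below), so that neighboring $C_i$ are encouraged to agree.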
In [114]:
X_reduced = PCA(8).fit_transform(X)[:200,:]
X_reduced.shape
Out[114]:
In [115]:
Y_reduced = Isomap().fit_transform(X_reduced)
Y_reduced.shape
Out[115]:
In [116]:
C,error = construct_C(X_reduced,Y_reduced,k=10)
In [117]:
np.hstack(C).shape
Out[117]:
In [118]:
C_ = np.hstack(C)
In [119]:
U = C_.dot(C_.T)
U.shape
Out[119]:
In [120]:
k = 15
G = neighbors.kneighbors_graph(X_reduced,k).toarray()
L = np.eye(len(G))*k - G
In [121]:
I = np.eye(2)
In [122]:
from scipy import linalg
In [123]:
omega_inv = linalg.kron(L,I)
In [124]:
omega_inv.shape
Out[124]:
In [125]:
omega = np.linalg.inv(omega_inv)
In [126]:
covariance_c_prior = linalg.kron(omega,U)
covariance_c_prior.shape
Out[126]:
In [127]:
# likelihood
# etc.
In [128]:
d_x = 8
d_y = 2
In [129]:
gamma = 0.1
V_inv = np.eye(d_y)*gamma
V = np.linalg.inv(V_inv)
V.shape
Out[129]:
In [130]:
G.shape
Out[130]:
In [131]:
i = 0
j = 0
n = len(X_reduced)
A_inv = np.empty((n,n),dtype=object)
from time import time
t0 = time()
for i in range(n):
if i%25 == 0:
print(i)
for j in range(n):
if i == j:
A_inv[i,j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
C[j].T.dot(V_inv).dot(C[j]))
if i != j:
A_inv[i,j] = np.zeros((d_x,d_x))
for k in range(n):
A_inv[i,j] += G[i,k]*(C[k].T.dot(V_inv).dot(C[k]) +
C[i].T.dot(V_inv).dot(C[i]))
t1 = time()
print(t1 - t0)
In [132]:
def compute_A_inv(X,C,G):
n = len(X)
A_inv = np.empty((n,n),dtype=object)
t0 = time()
for i in range(n):
if i%10 == 0:
print(i)
for j in range(n):
if i == j:
A_inv[i,j] = -1 * (C[i].T.dot(V_inv).dot(C[i]) +
C[j].T.dot(V_inv).dot(C[j]))
if i != j:
A_inv[i,j] = np.zeros((d_x,d_x))
for k in range(n):
A_inv[i,j] += G[i,k]*(C[k].T.dot(V_inv).dot(C[k]) +
C[i].T.dot(V_inv).dot(C[i]))
t1 = time()
print(t1 - t0)
return A_inv
Constructing this matrix is very slow, but it could be trivially parallelized.
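For example, a sketch of row-wise parallelization with joblib (hypothetical `A_inv_row` helper; same formula as the serial loop above, and assumes `C`, `G`, `V_inv`, `d_x`, and `n` are in scope):
In [ ]:
from joblib import Parallel, delayed

def A_inv_row(i, C, G, V_inv, d_x, n):
    '''Row i of the block matrix A_inv, mirroring the serial loop above.'''
    row = np.empty(n, dtype=object)
    for j in range(n):
        if i == j:
            # equals -1*(C_i^T V^{-1} C_i + C_j^T V^{-1} C_j) when i == j
            row[j] = -2 * C[i].T.dot(V_inv).dot(C[i])
        else:
            row[j] = np.zeros((d_x, d_x))
            for k in range(n):
                row[j] += G[i, k] * (C[k].T.dot(V_inv).dot(C[k]) +
                                     C[i].T.dot(V_inv).dot(C[i]))
    return row

rows = Parallel(n_jobs=-1)(delayed(A_inv_row)(i, C, G, V_inv, d_x, n)
                           for i in range(n))
A_inv_parallel = np.vstack(rows)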
In [133]:
b = np.empty(n,dtype=object)
In [134]:
for i in range(n):
b[i] = np.zeros(d_x)
for j in range(n):
b[i] -= G[i,j]*(C[j]+C[i]).T.dot(V_inv).dot(Y_reduced[j] - Y_reduced[i])
In [136]:
k = 0
C[k].T.dot(V_inv).dot(C[k]).shape
Out[136]:
In [139]:
# A_inv_ij in R^{d_x * d_x}
A_inv[i,j].shape
Out[139]:
In [638]:
# E step
In [639]:
# M step
In [147]:
X_.shape
Out[147]:
Kernel approximation
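For reference, the Nyström method approximates the full kernel matrix from a subset of $m$ landmark points,
$$ K \;\approx\; K_{nm}\, K_{mm}^{+}\, K_{nm}^{\top}, $$
where $K_{nm}$ holds kernel evaluations between all $n$ points and the $m$ landmarks and $K_{mm}$ is the kernel matrix among the landmarks; `Nystroem.transform` returns feature vectors whose inner products reproduce this approximation.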
In [142]:
from sklearn import kernel_approximation
from sklearn.kernel_approximation import RBFSampler
In [144]:
nystrom = kernel_approximation.Nystroem()
In [149]:
%timeit nystrom.fit(X_)
In [150]:
nystrom.fit(X_)
Out[150]:
In [151]:
nystrom.transform(X_).shape
Out[151]:
In [ ]: